Meta Data Exploration

Clayton Miller -- miller.clayton@gmail.com

This notebook gives an overview of the buildings in this study


In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [2]:
%matplotlib inline

In [9]:
repos_path = "/Users/nus/temporal-features-for-nonres-buildings-library/"

In [10]:
meta = pd.read_csv(os.path.join(repos_path,"data/raw/meta_open_withclassificationobjectives.csv"), index_col='uid', parse_dates=["datastart","dataend"], dayfirst=True)

In [11]:
meta.info()


<class 'pandas.core.frame.DataFrame'>
Index: 507 entries, PrimClass_Everett to UnivLab_Aoife
Data columns (total 21 columns):
dataend                   507 non-null datetime64[ns]
datastart                 507 non-null datetime64[ns]
energystarscore           26 non-null float64
heatingtype               124 non-null object
industry                  507 non-null object
mainheatingtype           122 non-null object
numberoffloors            124 non-null float64
occupants                 105 non-null float64
primaryspaceusage         507 non-null object
rating                    131 non-null object
sqft                      507 non-null float64
sqm                       507 non-null float64
subindustry               507 non-null object
timezone                  507 non-null object
yearbuilt                 313 non-null object
nickname                  507 non-null object
primaryspaceuse_abbrev    507 non-null object
newweatherfilename        507 non-null object
dailymeancons             507 non-null float64
usagecategory             507 non-null object
operationsgroup           507 non-null object
dtypes: datetime64[ns](2), float64(6), object(13)
memory usage: 87.1+ KB

In [12]:
sns.set_style("whitegrid")

In [18]:
meta.primaryspaceusage.value_counts()


Out[18]:
Office                         156
Primary/Secondary Classroom    105
College Laboratory              95
College Classroom               81
Dormitory                       70
Name: primaryspaceusage, dtype: int64

In [19]:
meta.usagecategory.value_counts()


Out[19]:
Intermediate    173
Low             169
High            165
Name: usagecategory, dtype: int64

In [13]:
meta.info()#[['industry','timezone']].head()


<class 'pandas.core.frame.DataFrame'>
Index: 507 entries, PrimClass_Everett to UnivLab_Aoife
Data columns (total 21 columns):
dataend                   507 non-null datetime64[ns]
datastart                 507 non-null datetime64[ns]
energystarscore           26 non-null float64
heatingtype               124 non-null object
industry                  507 non-null object
mainheatingtype           122 non-null object
numberoffloors            124 non-null float64
occupants                 105 non-null float64
primaryspaceusage         507 non-null object
rating                    131 non-null object
sqft                      507 non-null float64
sqm                       507 non-null float64
subindustry               507 non-null object
timezone                  507 non-null object
yearbuilt                 313 non-null object
nickname                  507 non-null object
primaryspaceuse_abbrev    507 non-null object
newweatherfilename        507 non-null object
dailymeancons             507 non-null float64
usagecategory             507 non-null object
operationsgroup           507 non-null object
dtypes: datetime64[ns](2), float64(6), object(13)
memory usage: 87.1+ KB

In [14]:
meta.head()


Out[14]:
dataend datastart energystarscore heatingtype industry mainheatingtype numberoffloors occupants primaryspaceusage rating ... sqm subindustry timezone yearbuilt nickname primaryspaceuse_abbrev newweatherfilename dailymeancons usagecategory operationsgroup
uid
PrimClass_Everett 2012-12-31 23:00:00 2012-01-01 NaN NaN Education NaN NaN NaN Primary/Secondary Classroom NaN ... 9804.053590 Primary/Secondary School America/New_York NaN Everett PrimClass weather12.csv 0.005935 Low Misc
UnivClass_Clifford 2015-12-31 23:00:00 2015-01-01 NaN NaN Education NaN NaN NaN College Classroom NaN ... 5292.591007 College/University America/New_York 1967 Clifford UnivClass weather2.csv 0.006464 Low Group3
Office_Elizabeth 2012-12-31 23:00:00 2012-01-01 NaN NaN Commercial Property NaN NaN NaN Office NaN ... 27373.961850 Commercial Real Estate America/Los_Angeles NaN Elizabeth Office weather22.csv 0.007931 Low Misc
Office_Ellie 2012-12-31 23:00:00 2012-01-01 NaN NaN Commercial Property NaN NaN NaN Office NaN ... 46127.918850 Bank/Financial Services America/Los_Angeles NaN Ellie Office weather28.csv 0.008237 Low Misc
PrimClass_Elisabeth 2012-12-31 23:00:00 2012-01-01 NaN NaN Education NaN NaN NaN Primary/Secondary Classroom NaN ... 21652.158990 Primary/Secondary School America/New_York NaN Elisabeth PrimClass weather23.csv 0.008597 Low Misc

5 rows × 21 columns


In [15]:
df = pd.pivot_table(meta.reset_index(), index='timezone', columns='subindustry', values='uid', aggfunc='count')

In [16]:
df


Out[16]:
subindustry Bank/Financial Services Business Services City, County, State College/University Commercial Real Estate Corporate Office Other Government Buildings Primary/Secondary School Social Services
timezone
America/Chicago NaN NaN NaN 68.0 NaN NaN NaN 7.0 NaN
America/Denver NaN NaN NaN NaN NaN NaN NaN 3.0 NaN
America/Los_Angeles 2.0 1.0 NaN 18.0 1.0 NaN NaN NaN NaN
America/New_York NaN 2.0 NaN 129.0 3.0 2.0 NaN 15.0 NaN
America/Phoenix NaN NaN NaN 96.0 NaN NaN NaN NaN NaN
Asia/Singapore NaN NaN NaN NaN NaN NaN NaN 5.0 NaN
Europe/London NaN NaN 13.0 43.0 NaN NaN 12.0 74.0 1.0
Europe/Zurich NaN NaN NaN 12.0 NaN NaN NaN NaN NaN

In [17]:
# crashes = sns.load_dataset("car_crashes").sort_values("total", ascending=False)
location = pd.DataFrame(meta.timezone.value_counts()).reset_index()

location


Out[17]:
index timezone
0 America/New_York 151
1 Europe/London 143
2 America/Phoenix 96
3 America/Chicago 75
4 America/Los_Angeles 22
5 Europe/Zurich 12
6 Asia/Singapore 5
7 America/Denver 3

In [12]:
def createbarchart(df, column, ylabel, color, filelabel):
    # Initialize the matplotlib figure
    #sns.set_context('poster', font_scale=1)    
    location = pd.DataFrame(df[column].value_counts()).reset_index()

    f, ax = plt.subplots(figsize=(9, 5))

    sns.set_color_codes("pastel")
    sns.barplot(x=column, y="index", data=location[:15], color=color)

    # Add a legend and informative axis label
    ax.legend(ncol=2, loc="lower right", frameon=True)
    ax.set(ylabel=ylabel,
           xlabel="Number of Buildings")
    sns.despine(left=True, bottom=True)
    plt.subplots_adjust(left=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(repos_path,"reports/figures/metadataoverview/"+filelabel+".png"))

In [13]:
location[:15]


Out[13]:
index timezone
0 America/New_York 151
1 Europe/London 143
2 America/Phoenix 96
3 America/Chicago 75
4 America/Los_Angeles 22
5 Europe/Zurich 12
6 Asia/Singapore 5
7 America/Denver 3

In [14]:
meta["string_starttime"] = meta.datastart.apply(lambda x: str(x.date()))

In [15]:
sns.set_context("paper", font_scale=2)                                                  
createbarchart(meta, "string_starttime", "Data Starting Time", "m","starttimesbar")


/Users/Clayton/anaconda/lib/python2.7/site-packages/matplotlib/axes/_axes.py:519: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "

In [16]:
#sns.set_style("white")
createbarchart(meta, "timezone", "Time Zones", "b","timezonesbar")



In [17]:
createbarchart(meta, "industry", "Industry", "r", "bar_industry")



In [18]:
createbarchart(meta, "subindustry", "Sub-Industry", "g","bar_subindustry")



In [19]:
createbarchart(meta, "primaryspaceusage", "Primary Use", "y", "bar_primaryspaceuse")


Create 4 bar charts in a set of panels


In [20]:
import numpy as np

In [21]:
# Simple data to display in various forms
x = np.linspace(0, 2 * np.pi, 400)
y = np.sin(x ** 2)


# row and column sharing
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex='col', sharey='row')
ax1.plot(x, y)
ax1.set_title('Sharing x per column, y per row')
ax2.scatter(x, y)
ax3.scatter(x, 2 * y ** 2 - 1, color='r')
ax4.plot(x, 2 * y ** 2 - 1, color='r')


Out[21]:
[<matplotlib.lines.Line2D at 0x11d94f390>]

In [22]:
sns.set_color_codes("pastel")
sns.set_context(font_scale=1)

In [23]:
import matplotlib.gridspec as gridspec

In [24]:
plt.figure(figsize=(18,10))
gs = gridspec.GridSpec(2, 8)
#f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2, figsize=(20, 9))#
# ax = plt.subplot2grid((2,2),(0, 0))

location = pd.DataFrame(meta["timezone"].value_counts()).reset_index()
ax1 = plt.subplot(gs[0,0:4])
ax1 = sns.barplot(x="timezone", y="index", data=location[:15], color="b")
ax1.legend(ncol=2, loc="lower right", frameon=True)
ax1.set(ylabel="Time Zones", xlabel="Number of Buildings")

location = pd.DataFrame(meta["industry"].value_counts()).reset_index()
ax2 = plt.subplot(gs[0,4:])
ax2 = sns.barplot(x="industry", y="index", data=location[:15], color="g")
ax2.legend(ncol=2, loc="lower right", frameon=True)
ax2.set(ylabel="Industry", xlabel="Number of Buildings")

location = pd.DataFrame(meta["subindustry"].value_counts()).reset_index()
ax3 = plt.subplot(gs[1,0:4])
ax3 = sns.barplot(x="subindustry", y="index", data=location[:15], color="r")
ax3.legend(ncol=2, loc="lower right", frameon=True)
ax3.set(ylabel="Sub-Industry", xlabel="Number of Buildings")

location = pd.DataFrame(meta["primaryspaceusage"].value_counts()).reset_index()
ax4 = plt.subplot(gs[1,4:])
ax4 = sns.barplot(x="primaryspaceusage", y="index", data=location[:15], color="y")
ax4.legend(ncol=2, loc="lower right", frameon=True)
ax4.set(ylabel="Primary Use", xlabel="Number of Buildings")

sns.despine(left=True, bottom=True)
plt.subplots_adjust(left=0.3)
plt.tight_layout()
plt.savefig(os.path.join(repos_path,"reports/figures/metadataoverview/allbars.pdf"))



In [ ]:


In [ ]: